package cn.doitedu.dmp.crawler;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;

/**
 * App information extraction.
 *
 * <p>Reads listing-page addresses from {@code crawler/data/applist_addr.txt}, one per line in the
 * form {@code path,label}. For each line it downloads {@code http://www.anzhi.com + path}, walks
 * the app list on that page and appends one record per app to {@code crawler/data/appinfo.txt}.
 *
 * <p>Each output record is {@code appId \001 appName \001 label \001 appDesc}, where {@code label}
 * is the second comma-separated field of the input line, copied through unchanged.
 */
public class AzappInfoExtract {

    /** Site prefix prepended to every path read from the input file. */
    private static final String BASE_URL = "http://www.anzhi.com";

    /** Input file: one {@code path,label} pair per line. */
    private static final String INPUT_PATH = "crawler/data/applist_addr.txt";

    /** Output file: one {@code \001}-delimited record per app. */
    private static final String OUTPUT_PATH = "crawler/data/appinfo.txt";

    /** Field delimiter used in the output records. */
    private static final String FIELD_SEPARATOR = "\001";

    /** Number of trailing characters dropped from the href tail (presumably ".html" - TODO confirm). */
    private static final int HREF_SUFFIX_LENGTH = 5;

    /**
     * Runs the extraction over every address line in the input file.
     *
     * <p>Reading stops at end of file or at the first blank line. Lines without a second
     * comma-separated field are skipped with a message on stderr.
     *
     * @param args unused
     * @throws Exception if a file cannot be opened, read or written, or a page cannot be fetched
     */
    public static void main(String[] args) throws Exception {

        // try-with-resources closes (and therefore flushes) the writer even when a fetch or parse
        // fails part-way through, so records already collected are not lost.
        // NOTE(review): both files use the platform default charset, as before; confirm whether
        // the producer of the input file and the consumers of the output expect UTF-8.
        try (BufferedReader br = new BufferedReader(new FileReader(INPUT_PATH));
             BufferedWriter bw = new BufferedWriter(new FileWriter(OUTPUT_PATH))) {

            String addr;
            while (StringUtils.isNotBlank(addr = br.readLine())) {

                String[] split = addr.split(",");
                if (split.length < 2) {
                    System.err.println("Skipping malformed address line: " + addr);
                    continue;
                }

                writeAppsOnPage(bw, split[0], split[1]);
            }
        }
    }

    /**
     * Downloads one listing page and writes a record for every app found on it.
     *
     * @param out   destination for the records
     * @param path  site-relative path of the listing page
     * @param label value written as the third field of every record from this page
     * @throws IOException if the page cannot be fetched or a record cannot be written
     */
    private static void writeAppsOnPage(BufferedWriter out, String path, String label) throws IOException {
        Document doc = Jsoup.connect(BASE_URL + path).get();

        Elements containers = doc.getElementsByClass("app_list border_three");
        if (containers.isEmpty()) {
            System.err.println("No app list found on page: " + path);
            return;
        }

        Elements lists = containers.get(0).getElementsByTag("ul");
        if (lists.isEmpty()) {
            System.err.println("App list has no <ul> on page: " + path);
            return;
        }

        for (Element li : lists.get(0).getElementsByTag("li")) {
            Elements infoBlocks = li.getElementsByClass("app_info");
            if (infoBlocks.isEmpty()) {
                // List item without an app_info block carries nothing to extract.
                continue;
            }
            Element appInfo = infoBlocks.get(0);
            Elements links = appInfo.getElementsByTag("a");

            String href = links.attr("href");
            String appId = extractAppId(href);
            if (appId == null) {
                System.err.println("Skipping app with unexpected href '" + href + "' on page: " + path);
                continue;
            }

            String appName = links.attr("title");
            String appDesc = appInfo.getElementsByTag("p").text();

            out.write(appId + FIELD_SEPARATOR + appName + FIELD_SEPARATOR + label + FIELD_SEPARATOR + appDesc);
            out.newLine();
        }
    }

    /**
     * Derives the app id from a link target: everything after the first underscore, minus the
     * trailing {@value #HREF_SUFFIX_LENGTH} characters.
     *
     * <p>Splitting only at the first underscore keeps ids that themselves contain underscores
     * intact.
     *
     * @param href link target taken from the app's anchor element
     * @return the app id, or {@code null} when {@code href} has no underscore or the part after it
     *         is shorter than the suffix
     */
    private static String extractAppId(String href) {
        int underscore = href.indexOf('_');
        if (underscore < 0) {
            return null;
        }
        String tail = href.substring(underscore + 1);
        if (tail.length() < HREF_SUFFIX_LENGTH) {
            return null;
        }
        return tail.substring(0, tail.length() - HREF_SUFFIX_LENGTH);
    }
}
